PREPROCESSING OF DATA

Load the necessary packages.

library(corrplot)
library(dplyr)
library(ggplot2)
library(DescTools)
library(cluster)
df <- read.csv("imputedResponses.csv",na.strings=c(""," ","NA"))
is.finite.data.frame <- function(obj){
    sapply(obj,FUN = function(x) all(is.finite(x)))
}
df <- df[rowSums(is.na(df)) == 0,]

We understand that body mass index is a better measure compared to height and weight individually,hence we replace the individual measures with the combined measure. We have established that the height is in cms and weight in kgs.

bmi = function(height,weight){
  return(weight/(height/100)^2)
}
df$bmi = bmi(df$Height,df$Weight)

hist(df$bmi,col = "blue",breaks = 100,xlim = c(12,60),main="Histogram For BMI",xlab='BMI')

# Since BMI is the numerical value of a scale different from other variables we recode the data to the same scale.
# 1,2,3,4,5 being underweight,fit,healthy,overweight and obese respectively.
df$bmi[df$bmi <= 18.5] = 1
df$bmi[df$bmi > 18.5 & df$bmi <= 20] = 2
df$bmi[df$bmi > 20 & df$bmi <= 25] = 3
df$bmi[df$bmi > 25 & df$bmi <= 30] = 4
df$bmi[df$bmi > 30] = 5

nreqAttributes <- names(df) %in% c("Height","Weight")
df <- df[!nreqAttributes]

FEATURE EXTRACTION

The process of feature extraction involves analyzing the data set to find the variables that should be used to classify, finding the relevant features to be used for classification. We study the personality traits group of our data set and extract the features as happinessSadness

happinessFactors <- c("Hypochondria","Loneliness","Dreams","Number.of.friends","Mood.swings",
                      "Getting.angry","Life.struggles","Happiness.in.life","Energy.levels","Personality")
happinessSadness <- df[happinessFactors]

# We consider the above mentoned variables as the factors of happiness and sadness.From their correlation 
# plot we can infer that none of the variables under study are very highly correlated.So,we use these ten 
# factors across sections of our dataset to find the variables which effect these factors the most and in 
# the end effect the happiness / sadness of people.
#print("Distribution of Happiness Factors")

Distribution of Happiness Factors

par(mfrow=c(2,5),mar=c(2,2,2,2))
for (factorV in happinessFactors){
  hist(happinessSadness[[factorV]],breaks = c(0,1,2,3,4,5),freq = FALSE,col="#3399FF",main="",mgp=c(1,0,0),xlab=factorV)
}

#mtext("Distribution of Happiness factors",side=1,line=-32,adj=1,padj=0)

We need to verify our assumption,as to our these factors actually determinants of someone’s happiness in life. We use logistic regression to predict happiness in life using the other variables.

LOGISTIC REGRESSION

formulaStr <- paste(names(happinessSadness[c(1:7,9,10)]), collapse='+')
formulaStr <- paste("Happiness.in.life ~",formulaStr)
# as.formula(formulaStr)
logitModel <- glm( as.formula(formulaStr),data = happinessSadness, family = "poisson", maxit = 100)
print(summary(logitModel))
## 
## Call:
## glm(formula = as.formula(formulaStr), family = "poisson", data = happinessSadness, 
##     maxit = 100)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.69487  -0.24435   0.01976   0.23531   0.91807  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        0.979392   0.161337   6.070 1.28e-09 ***
## Hypochondria       0.010196   0.015036   0.678  0.49768    
## Loneliness        -0.054791   0.016840  -3.254  0.00114 ** 
## Dreams             0.029316   0.025125   1.167  0.24328    
## Number.of.friends  0.018341   0.017892   1.025  0.30530    
## Mood.swings       -0.011191   0.018429  -0.607  0.54369    
## Getting.angry     -0.001548   0.015445  -0.100  0.92017    
## Life.struggles    -0.004702   0.012685  -0.371  0.71086    
## Energy.levels      0.060863   0.019636   3.100  0.00194 ** 
## Personality        0.041341   0.028409   1.455  0.14561    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 197.78  on 985  degrees of freedom
## Residual deviance: 135.55  on 976  degrees of freedom
## AIC: 3276.3
## 
## Number of Fisher Scoring iterations: 4
tempTestData <- df[sample(1:nrow(df),300,replace = TRUE),]
tempTestData <- tempTestData[happinessFactors]
predictedV <- predict(logitModel,tempTestData)
# polr(as.formula(formulaStr),happinessSadness)

plot(logitModel$residuals,logitModel$fitted.values,main="Plot of Fitted vs Residual values",xlab="Residuals",ylab="Fitted values")

# Function for Root Mean Squared Error
RMSE <- function(error) { sqrt(mean(error^2)) }
RMSE(logitModel$residuals)
## [1] 0.1947679
# If you want, say, MAE, you can do the following:

# Function for Mean Absolute Error
mae <- function(error) { mean(abs(error)) }
mae(logitModel$residuals)
## [1] 0.1500316

Principle Component Analysis

We perform Principal Component Analysis on the extracted features to obtain an independent variable which represents how Happy someone really is.

modify <- function(x) 5-x
modifiedHappinessSadness <- data.frame(happinessSadness[,c('Dreams','Number.of.friends','Happiness.in.life','Energy.levels','Personality')], lapply(happinessSadness[,c('Hypochondria','Loneliness','Mood.swings','Getting.angry','Life.struggles')], modify) )

pca <- prcomp(modifiedHappinessSadness)
pcaHappinessSadness = as.data.frame(pca$x[,1])
Happy <- vector(length = 986)
modifiedHappinessSadness = cbind(modifiedHappinessSadness,pcaHappinessSadness,Happy)
colnames(modifiedHappinessSadness)[11] <- "pcaHappinessSadness"
pcaCorrelation = cor(happinessSadness,pca$x[,1:4])
corrplot(pcaCorrelation,title="Correlation of principle components with happiness sadness factors",mar=c(0,0,2,0))

modifiedHappinessSadness$Happy[modifiedHappinessSadness$pcaHappinessSadness < 0] = "FALSE"
modifiedHappinessSadness$Happy[modifiedHappinessSadness$pcaHappinessSadness > 0] = "TRUE"
df$Happy <- modifiedHappinessSadness$Happy
happinessCount <- table(df$Happy,df$Gender)
barplot(happinessCount, main = paste("Happy vs Gender"), col = c("red","blue"))
legend("topright",legend = rownames(happinessCount),fill = c("red","blue") ,ncol = 1,cex = 0.4)

This function is used for finding contribution of the relevant attributes on principle components.

# Helper function 
varCoordFunc <- function(loadings, comp.sdev){
  loadings*comp.sdev
}
# Compute Coordinates
loadings <- pca$rotation
sdev <- pca$sdev
varCoord <- t(apply(loadings, 1, varCoordFunc, sdev))

# Compute Cos2
varCos2 <- varCoord^2

# Compute contributions
compCos2 <- apply(varCos2, 2, sum)
contrib <- function(varCos2, compCos2){varCos2*100/compCos2}
varContrib <- t(apply(varCos2,1, contrib, compCos2))
varContrib[,1:2]
##                         PC1          PC2
## Dreams             1.061990  0.778612578
## Number.of.friends  6.966440 17.987340141
## Happiness.in.life  6.750761  5.479806853
## Energy.levels      9.497671 14.810048896
## Personality        2.210171  1.061324333
## Hypochondria       6.691236  0.007977579
## Loneliness        18.781307  8.316843802
## Mood.swings       14.549957  0.810755366
## Getting.angry     10.596535 10.256665877
## Life.struggles    22.893931 40.490624574

We obtain relevant attributes by observing the correlation values. We compute the correlation between these attributes by using Goodman and Kruskal’s gamma method.

findCorrelation function-This is the function used to find rank correlation of various attributes in the data set with respect to a predictor variable like “Happy” or “Happiness in life” using Goodman and Kruskal’s gamma method. We compare values of each attribute with the predictor variable and consider only those attributes which give a gamma value<-0.25 or gamma value>0.25.

findCorrelation <- function(workingData,predictorVariable){
  corrVals <- list()
i <- 1
for(variable in colnames(workingData)){
  # print(variable)
  corrVals[i] <- GoodmanKruskalGamma(workingData[[variable]],predictorVariable)
  i <- i + 1
}
corrVals <- as.numeric(corrVals)
setNames(corrVals,colnames(workingData))
corrVals <- data.frame(corrVals)

corrVals$Attributes = as.vector(colnames(workingData))
corrValsModified <- corrVals
corrValsModified$Attributes <- factor(corrValsModified$Attributes, levels = corrValsModified$Attributes[order(-corrValsModified$corrVals)])
relevantTraits <- corrValsModified[(corrValsModified$corrVals >= 0.25 | corrValsModified$corrVals <= -0.25),]
p <- ggplot(relevantTraits, aes(x=Attributes,y=corrVals,fill = Attributes)) + geom_bar(stat="identity") + scale_fill_hue() + coord_flip()+ggtitle("Relevant Attributes")
print(p)
print(relevantTraits$Attributes)
return(relevantTraits)
}

We encode happiness value “TRUE” as 1 and happiness value “FALSE” as 0.

# Analysis of various traits wrt Happy Label.
df$Happy[df$Happy == TRUE] = 1
df$Happy[df$Happy == FALSE] = 0
df$Happy = as.numeric(df$Happy)
workingData <- df
nreqVariables = names(workingData) %in% happinessFactors
relevantAttributes1 <- findCorrelation(workingData = workingData[!nreqVariables],workingData$Happy)

##  [1] Gender                  Health                 
##  [3] Changing.the.past       New.environment        
##  [5] Public.speaking         Interests.or.hobbies   
##  [7] Storm                   Darkness               
##  [9] Spiders                 Fear.of.public.speaking
## [11] Cars                    Active.sport           
## [13] Adrenaline.sports       Romantic               
## [15] Western                 Action                 
## [17] bmi                     Happy                  
## 141 Levels: Happy Gender Cars New.environment ... Darkness
relevantAttributes2 <- findCorrelation(workingData = workingData[!(names(workingData) %in% c('Happy','Happiness.in.life'))],workingData$Happiness.in.life)

## [1] Loneliness           Changing.the.past    Dreams              
## [4] Number.of.friends    Mood.swings          Energy.levels       
## [7] Personality          Interests.or.hobbies Fun.with.friends    
## 149 Levels: Energy.levels Personality Number.of.friends ... Loneliness
library(clue)
library(factoextra)
library(caret)
library(scales)

rawData1 has the attributes having correlation magnitude >0.25 with respect “Happy” variable. trainData1 has 80% of rawData1 which is used for training models. testData1 has 20% of rawData1 which is used for predicting based on the trained models. resultsDF dataframe has the predicted values of different clustering techniques along with “Happy” values of Test data used for comparison of accuracy.

# Preparing Data for clustering,scaling all the attributes to 1-5,omitting categorical data.
rawData1 <- na.omit(df[as.vector(relevantAttributes1$Attributes)])
rawData1$Happy <- na.omit(df$Happy)
nreqAttributes <- names(rawData1) %in% c("bmi","Gender")
rawData1 <- rawData1[!nreqAttributes]
# Scale Age weight and height.
rawData1$Age <- rescale(rawData1$Age, to = c(1, 5), from = range(rawData1$Age))
nrowsTrain <- 0.8*nrow(rawData1)
trainData1 <- rawData1[1:nrowsTrain,]
actualData1 <- df[nrowsTrain:nrow(df),]
testData1 <- rawData1[nrowsTrain:nrow(rawData1),]
# resultsDF <- data.frame(matrix(NA,nrow = nrow(testData1)))
resultsDF <- data.frame(testData1$Happy)
columnNames <- colnames(rawData1)
resultsList = list()

Visualization of Relevant attributes

par(mfrow=c(2,5),mar=c(2,2,2,2))
for (factorV in columnNames[1:15]){
  hist(trainData1[[factorV]],breaks = c(0,1,2,3,4,5),freq = FALSE,col="#3399FF",main="",mgp=c(1,0,0),xlab=factorV)
}

K-MEANS CLUSTERING

k-means clustering aims to partition the observations into k clusters in which each observation belongs to the cluster with the nearest mean. This method uses 2-norm distance metric(Euclidean distance) for classifying the observations into clusters. We consider k=4.

#K-Means clustering for the data
fviz_nbclust(trainData1, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

set.seed(123)
km.res <- kmeans(trainData1, 3, nstart = 25,iter.max = 10000)
km.res$centers
##     Health Changing.the.past New.environment Public.speaking
## 1 3.026756          2.886288        3.889632        3.076923
## 2 3.290456          3.016598        3.128631        3.680498
## 3 3.576613          3.104839        3.334677        4.028226
##   Interests.or.hobbies    Storm Darkness  Spiders Fear.of.public.speaking
## 1             4.003344 1.525084 1.625418 2.170569                2.367893
## 2             3.078838 1.709544 1.941909 2.278008                2.879668
## 3             3.443548 2.919355 3.366935 4.258065                3.185484
##       Cars Active.sport Adrenaline.sports Romantic  Western   Action
## 1 3.541806     4.280936          4.036789 3.063545 2.581940 4.113712
## 2 2.033195     2.070539          1.917012 3.302905 1.933610 3.215768
## 3 2.225806     3.334677          2.737903 4.201613 1.693548 3.116935
##       Happy
## 1 0.7123746
## 2 0.3941909
## 3 0.3306452
fviz_cluster(km.res, data = trainData1,main="K-means cluster plot wrt Happy factor with k=3")

kmeansCluster <- trainData1
kmeansCluster$clusterNo <- km.res$cluster
clusplot(kmeansCluster[c("Adrenaline.sports","Active.sport","Health")],kmeansCluster$clusterNo,main="Effect of active engagement in activities on happiness")

clusplot(kmeansCluster[c("Changing.the.past","Spiders","Storm","Darkness")],kmeansCluster$clusterNo,main = "Negative Impactors of Happy Clustered together")

We plot the clusters obtained after clustering.

library(gridExtra)
for(factorV in colnames(trainData1)){
tempdf <- kmeansCluster %>%
    group_by(clusterNo,!!sym(factorV)) %>%
    summarise(counts = n())
  
  p <- ggplot(tempdf, aes(fill=!!sym(factorV), y = counts,x=clusterNo)) +
      geom_bar(position="dodge", stat="identity",width = 0.3)+
    labs(title=paste("Cluster Analysis for",factorV), x="Cluster No", y="No of people belonging to the cluster") + 
    theme(plot.title = element_text(size=16))
  print(p)
}

# grid.arrange(p,nrow = 3,ncol = 5)

We predict the clusters for test data based on the k-means model trained and then compare the accuracy of the prediction by comparing with actual data.

predictedCluster <- as.vector(cl_predict(km.res,testData1))
resultsDF$PredictedKmean = predictedCluster
resultsDF$PredictedKmean[resultsDF$PredictedKmean == 3| resultsDF$PredictedKmean == 2] = 0
resultsDF$PredictedKmean[resultsDF$PredictedKmean == 1] = 1
resultsList$Kmeans <- confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedKmean,levels = 0:1),positive = '1',mode="prec_recall")
print(resultsList$Kmeans)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 70 20
##          1 43 65
##                                         
##                Accuracy : 0.6818        
##                  95% CI : (0.612, 0.746)
##     No Information Rate : 0.5707        
##     P-Value [Acc > NIR] : 0.0008779     
##                                         
##                   Kappa : 0.3717        
##  Mcnemar's Test P-Value : 0.0055758     
##                                         
##               Precision : 0.6019        
##                  Recall : 0.7647        
##                      F1 : 0.6736        
##              Prevalence : 0.4293        
##          Detection Rate : 0.3283        
##    Detection Prevalence : 0.5455        
##       Balanced Accuracy : 0.6921        
##                                         
##        'Positive' Class : 1             
## 

Method 2 of K-means clustering for “Happiness in life”

rawData2 <- na.omit(df[as.vector(relevantAttributes2$Attributes)])
rawData2$Happiness.in.life <- na.omit(df$Happiness.in.life)
rawData2$happinessModified = rawData2$Happiness.in.life
rawData2$happinessModified[rawData2$happinessModified<=3] = 0
rawData2$happinessModified[rawData2$happinessModified>3] = 1
#rawData2[-c(10)]
nrowsTrain <- 0.9*nrow(rawData2)
trainData2 <- rawData2[1:nrowsTrain,]
actualData <- df[nrowsTrain:nrow(df),]
testData2 <- rawData2[nrowsTrain:nrow(rawData2),]
fviz_nbclust(trainData2, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

set.seed(123)
km.res <- kmeans(trainData2, 3, nstart = 25)
tempCenters <- as.data.frame(km.res$centers)
print(tempCenters)
##   Loneliness Changing.the.past   Dreams Number.of.friends Mood.swings
## 1   3.529210          3.333333 3.130584          2.587629    3.611684
## 2   3.049808          3.938697 3.279693          3.739464    3.532567
## 3   2.164179          1.874627 3.489552          3.737313    2.737313
##   Energy.levels Personality Interests.or.hobbies Fun.with.friends
## 1      2.786942    2.996564             2.597938         4.219931
## 2      4.061303    3.394636             4.049808         4.727969
## 3      4.125373    3.483582             4.026866         4.704478
##   Happiness.in.life happinessModified
## 1          3.151203         0.3608247
## 2          3.685824         0.6398467
## 3          4.194030         0.9014925
fviz_cluster(km.res, data = trainData2,main="K-means cluster plot wrt Happiness in life with k=3")

predictedCluster <- as.vector(cl_predict(km.res,testData2))
testData2$PredictedKmean2 = predictedCluster
testData2$PredictedKmean[testData2$PredictedKmean == 5] = 2
testData2$PredictedKmean[testData2$PredictedKmean == 3 ] = 3
testData2$PredictedKmean[testData2$PredictedKmean == 4 | testData2$PredictedKmean == 1| testData2$PredictedKmean == 7|testData2$PredictedKmean == 2 ] = 5
confusionMatrix(factor(actualData$Happiness.in.life,levels = 1:5),factor(testData2$PredictedKmean,levels = 1:5),mode="prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4  5
##          1  0  0  0  0  0
##          2  0  0  0  0  4
##          3  0  0  4  0 28
##          4  0  0 24  0 26
##          5  0  0 11  0  2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.0606          
##                  95% CI : (0.0226, 0.1273)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.1845         
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Precision                  NA   0.0000   0.1250   0.0000  0.15385
## Recall                     NA       NA   0.1026       NA  0.03333
## F1                         NA       NA   0.1127       NA  0.05479
## Prevalence                  0   0.0000   0.3939   0.0000  0.60606
## Detection Rate              0   0.0000   0.0404   0.0000  0.02020
## Detection Prevalence        0   0.0404   0.3232   0.5051  0.13131
## Balanced Accuracy          NA       NA   0.3179       NA  0.37564
testData2$PredictedKmean2[testData2$PredictedKmean2 == 1 | testData2$PredictedKmean2 == 2 ] = 0
testData2$PredictedKmean2[testData2$PredictedKmean2 == 3] = 1
confusionMatrix(factor(actualData$Happiness.in.life,levels = 1:5),factor(testData2$PredictedKmean,levels = 1:5),mode="prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4  5
##          1  0  0  0  0  0
##          2  0  0  0  0  4
##          3  0  0  4  0 28
##          4  0  0 24  0 26
##          5  0  0 11  0  2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.0606          
##                  95% CI : (0.0226, 0.1273)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.1845         
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Precision                  NA   0.0000   0.1250   0.0000  0.15385
## Recall                     NA       NA   0.1026       NA  0.03333
## F1                         NA       NA   0.1127       NA  0.05479
## Prevalence                  0   0.0000   0.3939   0.0000  0.60606
## Detection Rate              0   0.0000   0.0404   0.0000  0.02020
## Detection Prevalence        0   0.0404   0.3232   0.5051  0.13131
## Balanced Accuracy          NA       NA   0.3179       NA  0.37564

HIERARCHICAL CLUSTERING

Hierarchical clustering is an approach for identifying clustering in the complete rawData1 dataset by using pairwise distance matrix between observations as clustering criteria. We perform hierarchical clustering using k=2 i.e classifying into 2 clusters. We have used “complete” method of hierarchical clustering.

#Hierarchical clustering for all the relevant attributes
##For k=2
library(dendextend)
library(colorspace)
distCalc <- dist(rawData1)
#Using the "complete" method for clustering as it gives better accuracy
hclusters <- hclust(distCalc, method = "complete")
happyLevels <- rev(levels(as.factor(rawData1$Happy)))

#Forming the dendrogram
dend <- as.dendrogram(hclusters)
dend <- rotate(dend, 1:150)
dend <- color_branches(dend, k=2)
labels_colors(dend) <-
  rainbow_hcl(2)[sort_levels_values(
    as.numeric(rawData1$Happy)[order.dendrogram(dend)]
  )]

labels(dend) <- paste(as.character(rawData1$Happy)[order.dendrogram(dend)],"(",labels(dend),")", sep = "")
dend <- hang.dendrogram(dend,hang_height=0.1)
dend <- set(dend, "labels_cex", 0.5)

#Plotting the dendrogram
par(mfrow=c(1,1))
plot(dend, 
     main = "Hierarchical clustering of relevant attributes", 
     horiz =  TRUE,  nodePar = list(cex = .007))
legend("topleft", legend = happyLevels, fill = rainbow_hcl(2))

hclusteringMethod <- c("complete")
hclustDendlist <- dendlist()
for(i in seq_along(hclusteringMethod)) {
  hCluster <- hclust(distCalc , method = hclusteringMethod[i])
  hclustDendlist <- dendlist(hclustDendlist, as.dendrogram(hCluster))
}
hclustDendlist
## [[1]]
## 'dendrogram' with 2 branches and 986 members total, at height 13.56466 
## 
## attr(,"class")
## [1] "dendlist"
getClusters <- function(dend) {
  cutree(dend, k =2)[order.dendrogram(dend)]
}
dendClusters <- lapply(hclustDendlist, getClusters)
dendClusters<-as.data.frame(dendClusters)
colnames(dendClusters)[1]<-"predicted.clusters"
modifiedDendClusters<-dendClusters
for (i in 1:nrow(modifiedDendClusters))
{
  #If it is cluster 1,Happy value =0
  if(modifiedDendClusters$predicted.clusters[i]==1)
    modifiedDendClusters$predicted.clusters[i]<-0
  else
    #If it is cluster 2,Happy value =1
    modifiedDendClusters$predicted.clusters[i]<-1
}
#For the confusion Matrix
referenceData<-rawData1$Happy
predictedData<-modifiedDendClusters$predicted.clusters
unionData <- union(predictedData,referenceData)
tableData <- table(factor(predictedData, unionData), factor(referenceData, unionData))
confusionMatrix(tableData,positive = '1',mode="prec_recall")
## Confusion Matrix and Statistics
## 
##    
##       0   1
##   0 215 199
##   1 273 299
##                                           
##                Accuracy : 0.5213          
##                  95% CI : (0.4896, 0.5529)
##     No Information Rate : 0.5051          
##     P-Value [Acc > NIR] : 0.1617510       
##                                           
##                   Kappa : 0.041           
##  Mcnemar's Test P-Value : 0.0007792       
##                                           
##               Precision : 0.5227          
##                  Recall : 0.6004          
##                      F1 : 0.5589          
##              Prevalence : 0.5051          
##          Detection Rate : 0.3032          
##    Detection Prevalence : 0.5801          
##       Balanced Accuracy : 0.5205          
##                                           
##        'Positive' Class : 1               
## 

Hierarchical clustering and prediction on Test data.

Here also we classify into 2 clusters by taking k=2.

#Hierarchical clustering and prediction using test data
library(dendextend)
library(colorspace)
distCalcTest <- dist(testData1)
#Using the "ward.D" method for clustering as it gives better accuracy
hclustersTest <- hclust(distCalcTest, method = "ward.D")
happyLevelsTest <- rev(levels(as.factor(testData1$Happy)))

#Forming the dendrogram
dendTest <- as.dendrogram(hclustersTest)
dendTest <- rotate(dendTest, 1:150)
dendTest <- color_branches(dendTest, k=2)
labels_colors(dendTest) <-
  rainbow_hcl(2)[sort_levels_values(
    as.numeric(testData1$Happy)[order.dendrogram(dendTest)]
  )]

labels(dend) <- paste(as.character(testData1$Happy)[order.dendrogram(dendTest)],"(",labels(dendTest),")", sep = "")
dendTest <- hang.dendrogram(dendTest,hang_height=0.1)
dendTest <- set(dendTest, "labels_cex", 0.5)

#Plotting the dendrogram
par(mfrow=c(1,1))
plot(dendTest, 
     main = "Hierarchical Clustering on Test Data", 
     horiz =  TRUE,  nodePar = list(cex = .007))
legend("topleft", legend = happyLevelsTest, fill = rainbow_hcl(2))

We use the “ward.D” method of hierarchical clustering.

hclusteringMethodTest <- c("ward.D")
hclustDendlistTest <- dendlist()
for(i in seq_along(hclusteringMethodTest)) {
  hClusterTest <- hclust(distCalcTest , method = hclusteringMethodTest[i])
  hclustDendlistTest <- dendlist(hclustDendlistTest, as.dendrogram(hClusterTest))
}
#hclustDendlistTest

getClustersTest <- function(dendTest) {
  cutree(dendTest, k =2)[order.dendrogram(dendTest)]
}
dendClustersTest <- lapply(hclustDendlistTest, getClustersTest)
dendClustersTest<-as.data.frame(dendClustersTest)

colnames(dendClustersTest)[1]<-"predictedHierarchical"

modifiedDendClustersTest<-dendClustersTest
for (i in 1:nrow(modifiedDendClustersTest))
{
  #If it is cluster 1,Happy value = 0
  if(modifiedDendClustersTest$predictedHierarchical[i]==1)
    modifiedDendClustersTest$predictedHierarchical[i]<-0
  else
    #If it is cluster 2,Happy value = 1
    modifiedDendClustersTest$predictedHierarchical[i]<-1
}
resultsDF<-cbind(resultsDF,modifiedDendClustersTest)

We compare the accuracy of the predicted and actual values obtained by hierarchical clustering.

#For the confusion Matrix
referenceDataTest<-testData1$Happy
predictedDataHierarchicalTest<-resultsDF$predictedHierarchical
unionDataHierarchicalTest <- union(predictedDataHierarchicalTest,referenceDataTest)
tableDataHierarchicalTest <- table(factor(predictedDataHierarchicalTest, unionDataHierarchicalTest), factor(referenceDataTest, unionDataHierarchicalTest))
resultsList$Hierarchical <- confusionMatrix(tableDataHierarchicalTest,positive = '1',mode="prec_recall")
print(resultsList$Hierarchical)
## Confusion Matrix and Statistics
## 
##    
##      1  0
##   1 68 33
##   0 40 57
##                                         
##                Accuracy : 0.6313        
##                  95% CI : (0.56, 0.6986)
##     No Information Rate : 0.5455        
##     P-Value [Acc > NIR] : 0.008903      
##                                         
##                   Kappa : 0.2613        
##  Mcnemar's Test P-Value : 0.482525      
##                                         
##               Precision : 0.6733        
##                  Recall : 0.6296        
##                      F1 : 0.6507        
##              Prevalence : 0.5455        
##          Detection Rate : 0.3434        
##    Detection Prevalence : 0.5101        
##       Balanced Accuracy : 0.6315        
##                                         
##        'Positive' Class : 1             
## 

K-MEDOID CLUSTERING

It is a partitional algorithm related to K-means and medoid shifting algorithms which uses medoids i.e., the points in the data set as centres We have partitioned into six clusters.

#k-medoid Clustering

library(cluster)

set.seed(123)
pam.res <- pam(trainData1, 6)
pam.res$medoids
##     Health Changing.the.past New.environment Public.speaking
## 570      3                 3               4               2
## 662      3                 3               4               4
## 765      4                 4               2               5
## 600      3                 3               3               3
## 30       4                 3               3               5
## 196      3                 4               3               3
##     Interests.or.hobbies Storm Darkness Spiders Fear.of.public.speaking
## 570                    4     2        3       2                       2
## 662                    3     2        2       2                       3
## 765                    3     2        2       5                       4
## 600                    4     1        2       2                       3
## 30                     4     3        4       5                       3
## 196                    3     2        3       5                       3
##     Cars Active.sport Adrenaline.sports Romantic Western Action Happy
## 570    2            5                 3        5       2      3     1
## 662    3            2                 2        3       1      3     1
## 765    1            2                 1        4       1      3     0
## 600    4            4                 4        3       3      5     1
## 30     2            4                 3        5       2      2     0
## 196    4            4                 3        4       2      4     0
fviz_cluster(pam.res, data = trainData1,main="K-Medoid Clustering with 6 clusters")

predictedClusterPam <- as.vector(cl_predict(pam.res,testData1))
resultsDF$PredictedPam = predictedClusterPam
resultsDF$PredictedPam[resultsDF$PredictedPam == 3 |resultsDF$PredictedPam == 5 | resultsDF$PredictedPam == 6] = 0
resultsDF$PredictedPam[resultsDF$PredictedPam == 1| resultsDF$PredictedPam == 2 | resultsDF$PredictedPam == 4] = 1
resultsList$KMedoid = confusionMatrix(factor(resultsDF$PredictedPam,levels = 0:1),factor(testData1$Happy,levels = 0:1),positive = '1',mode="prec_recall")
print(resultsList$KMedoid)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 45 19
##          1 45 89
##                                           
##                Accuracy : 0.6768          
##                  95% CI : (0.6068, 0.7413)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 0.0001136       
##                                           
##                   Kappa : 0.3321          
##  Mcnemar's Test P-Value : 0.0017781       
##                                           
##               Precision : 0.6642          
##                  Recall : 0.8241          
##                      F1 : 0.7355          
##              Prevalence : 0.5455          
##          Detection Rate : 0.4495          
##    Detection Prevalence : 0.6768          
##       Balanced Accuracy : 0.6620          
##                                           
##        'Positive' Class : 1               
## 

K-MEDIANS CLUSTERING

k-medians clustering partitions the observations into k clusters in which each observation belongs to the cluster with the nearest median. This method uses 1-norm distance metric(Manhattan distance) for classifying the observations into clusters. We consider k=4 and perform k-medians clustering on the Training data.This model is used to predict the clusters for Test data.

library(flexclust)
set.seed(12)
#Considering k=4
#To iterate and train the model 15 times
for(i in 1:15)
{
  kMedian = kcca(trainData1, k=4, kccaFamily("kmedians"),save.data = TRUE)
}
kMedianValues = parameters(kMedian)
#To print the median value of each cluster
print(kMedianValues)
##      Health Changing.the.past New.environment Public.speaking
## [1,]      3                 3               4               4
## [2,]      3                 3               3               4
## [3,]      3                 3               4               2
## [4,]      4                 3               3               4
##      Interests.or.hobbies Storm Darkness Spiders Fear.of.public.speaking
## [1,]                    4     1        1       2                       3
## [2,]                    3     1        2       2                       3
## [3,]                    4     1        2       3                       2
## [4,]                    3     3        3       5                       3
##      Cars Active.sport Adrenaline.sports Romantic Western Action Happy
## [1,]    3            5                 4        3       3      4     1
## [2,]    2            2                 1        3       1      3     0
## [3,]    4            4                 4        4       2      4     1
## [4,]    2            3                 3        4       1      3     0
kMedianTrainClusters<-clusters(kMedian)
#Plotting the clusters
clusplot(trainData1,kMedianTrainClusters,main = paste("CLUSPLOT For K Medians(k=4)"))

predClusterMedian<- predict(kMedian, newdata=testData1, k=4, kccaFamily("kmedians"))

From the above predicted clusters, we observe that clusters 1,3 have median values 0 and clusters 2,4 have median value as 1. Therefore we recode the cluster values as the median values in order to compare with the actual Happy values.

resultsDF$predictedKMedian = predClusterMedian
resultsDF$predictedKMedian[resultsDF$predictedKMedian == 2| resultsDF$predictedKMedian  == 4] = 0
resultsDF$predictedKMedian[resultsDF$predictedKMedian == 1| resultsDF$predictedKMedian  == 3] = 1
#Computing a confusion matrix of the predicted and actual data
resultsList$KMedian <- confusionMatrix(factor(resultsDF$predictedKMedian,levels = 0:1),factor(testData1$Happy ,levels = 0:1),positive = '1',mode="prec_recall")
print(resultsList$KMedian)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 62 34
##          1 28 74
##                                           
##                Accuracy : 0.6869          
##                  95% CI : (0.6173, 0.7507)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 3.407e-05       
##                                           
##                   Kappa : 0.372           
##  Mcnemar's Test P-Value : 0.5254          
##                                           
##               Precision : 0.7255          
##                  Recall : 0.6852          
##                      F1 : 0.7048          
##              Prevalence : 0.5455          
##          Detection Rate : 0.3737          
##    Detection Prevalence : 0.5152          
##       Balanced Accuracy : 0.6870          
##                                           
##        'Positive' Class : 1               
## 

Fuzzy c-means(FCM)

Fuzzy c-means(soft clustering or soft k-means) is a clustering technique which allows each observation to belong to more than one cluster. Each observation is assigned a membership grade for each cluster it belongs to. We partition into 3 clusters.

# set.seed(1)
library(cluster)
library(e1071)
cmeansCluster <- cmeans(trainData1,centers = 3 ,iter.max = 100, verbose = FALSE,dist = "manhattan", method = "cmeans", m = 2,rate.par = NULL, weights = 1, control = list())
print(cmeansCluster$centers)
##   Health Changing.the.past New.environment Public.speaking
## 1      3                 3               4               4
## 2      3                 3               4               3
## 3      3                 3               3               4
##   Interests.or.hobbies Storm Darkness Spiders Fear.of.public.speaking Cars
## 1                    4     1        2       2                       3    3
## 2                    4     2        2       2                       3    3
## 3                    3     2        2       3                       3    2
##   Active.sport Adrenaline.sports Romantic Western Action Happy
## 1            4                 3        3       2      4     1
## 2            4                 3        3       2      4     1
## 3            3                 3        4       2      3     0
clusplot(trainData1,cmeansCluster$cluster,main="CLUSPLOT for Fuzzy C-Means with 3 clusters")

resultsDF$PredictedCmeans <- cl_predict(cmeansCluster,testData1,type = "class_ids")
resultsDF$PredictedCmeans[resultsDF$PredictedCmeans == 3] = 0
resultsDF$PredictedCmeans[resultsDF$PredictedCmeans == 1| resultsDF$PredictedCmeans == 2] = 1
resultsList$Cmeans <- confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedCmeans,levels = 0:1),positive = '1',mode="prec_recall")
print(resultsList$Cmeans)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 56 34
##          1 21 87
##                                           
##                Accuracy : 0.7222          
##                  95% CI : (0.6543, 0.7834)
##     No Information Rate : 0.6111          
##     P-Value [Acc > NIR] : 0.0006932       
##                                           
##                   Kappa : 0.433           
##  Mcnemar's Test P-Value : 0.1056454       
##                                           
##               Precision : 0.8056          
##                  Recall : 0.7190          
##                      F1 : 0.7598          
##              Prevalence : 0.6111          
##          Detection Rate : 0.4394          
##    Detection Prevalence : 0.5455          
##       Balanced Accuracy : 0.7231          
##                                           
##        'Positive' Class : 1               
## 

DECISION TREE

Decision Tree creates a model that predicts the value of target variable by learning simple decision rules inferred from the features.We have implemented the CART and C5.0(an extension of ID3 algorithm) algorithms for decision tree.

#Decision Tree using CART method
set.seed(1)
library(rpart)
library(rpart.plot)

fit <- rpart(Happy~., data = trainData1, method = 'class')
rpart.plot(fit, extra = 101,main="Decision Tree using CART method")

predictedVal <-predict(fit, testData1, type = 'class')

tableMat <- table(testData1$Happy, predictedVal)
resultsList$DecTreeCart <- confusionMatrix(tableMat,positive = '1',mode="prec_recall")

C5.0 is an extension of ID3 algorithm to generate a decision tree and uses normalized information gain as the criteria to split the samples into a class.

#Decision Tree using C5.0
library(C50)
set.seed(123)
formulaStrDt <- paste(names(trainData1[c(1:2,5,9,13)]), collapse='+')
formulaStrDt <- paste("factor(Happy) ~",formulaStrDt)
model <- C5.0(as.formula(formulaStrDt),data=trainData1)
plot(model,main="Decision Tree for C5.0 method")

resultsDF$predictedDTC5 = predict(model,testData1)
resultsList$DecTreeC50 <- confusionMatrix(factor(resultsDF$predictedDTC5),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$DecTreeC50)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 63 33
##          1 27 75
##                                           
##                Accuracy : 0.697           
##                  95% CI : (0.6278, 0.7601)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 9.344e-06       
##                                           
##                   Kappa : 0.3923          
##  Mcnemar's Test P-Value : 0.5186          
##                                           
##               Precision : 0.7353          
##                  Recall : 0.6944          
##                      F1 : 0.7143          
##              Prevalence : 0.5455          
##          Detection Rate : 0.3788          
##    Detection Prevalence : 0.5152          
##       Balanced Accuracy : 0.6972          
##                                           
##        'Positive' Class : 1               
## 

RANDOM FOREST

Random Forest is an ensemble classification method that generates multiple decision trees and outputs the mode of the classes.

#Decision tree using Random Forest
library(randomForest)
set.seed(1)
rf = randomForest(factor(Happy)~.,data = trainData1,ntree=750)
plot(rf,main="Error plot for Random Forest")

varImpPlot(rf,main="Importance of variables")

resultsDF$predictedRf = predict(rf,testData1)
resultsList$DecTreeRF <- confusionMatrix(factor(resultsDF$predictedRf),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$DecTreeRF)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 64 27
##          1 26 81
##                                           
##                Accuracy : 0.7323          
##                  95% CI : (0.6649, 0.7926)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 4.834e-08       
##                                           
##                   Kappa : 0.4607          
##  Mcnemar's Test P-Value : 1               
##                                           
##               Precision : 0.7570          
##                  Recall : 0.7500          
##                      F1 : 0.7535          
##              Prevalence : 0.5455          
##          Detection Rate : 0.4091          
##    Detection Prevalence : 0.5404          
##       Balanced Accuracy : 0.7306          
##                                           
##        'Positive' Class : 1               
## 

SUPPORT VECTOR MACHINES

It is a supervised machine learning algorithm which plots the data points in an n-dimensional space where n is the number of features and the value of a particular coordinate is the value of the feature.

SVM linear method

#SVM linear
svmLinearModel = svm(Happy~.,data = trainData1,type="C-classification",kernel="linear")
resultsDF$predictedSvmLinear = predict(svmLinearModel,testData1)
resultsList$svmLin <- confusionMatrix(factor(resultsDF$predictedSvmLinear),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$svmLin)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 60 28
##          1 30 80
##                                           
##                Accuracy : 0.7071          
##                  95% CI : (0.6384, 0.7694)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 2.338e-06       
##                                           
##                   Kappa : 0.4082          
##  Mcnemar's Test P-Value : 0.8955          
##                                           
##               Precision : 0.7273          
##                  Recall : 0.7407          
##                      F1 : 0.7339          
##              Prevalence : 0.5455          
##          Detection Rate : 0.4040          
##    Detection Prevalence : 0.5556          
##       Balanced Accuracy : 0.7037          
##                                           
##        'Positive' Class : 1               
## 
plot(svmLinearModel,data=trainData1,Changing.the.past ~ Interests.or.hobbies) 

SVM polynomial method

#SVM polynomial

svmPolyModel = svm(Happy~.,data = trainData1,type="C-classification",kernel="polynomial")
resultsDF$predictedSvmPoly = predict(svmPolyModel,testData1)
resultsList$svmPoly <- confusionMatrix(factor(resultsDF$predictedSvmPoly),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$svmPoly)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 55 23
##          1 35 85
##                                           
##                Accuracy : 0.7071          
##                  95% CI : (0.6384, 0.7694)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 2.338e-06       
##                                           
##                   Kappa : 0.4026          
##  Mcnemar's Test P-Value : 0.1486          
##                                           
##               Precision : 0.7083          
##                  Recall : 0.7870          
##                      F1 : 0.7456          
##              Prevalence : 0.5455          
##          Detection Rate : 0.4293          
##    Detection Prevalence : 0.6061          
##       Balanced Accuracy : 0.6991          
##                                           
##        'Positive' Class : 1               
## 
plot(svmPolyModel,data=trainData1,Changing.the.past ~ Interests.or.hobbies) 

SVM radial method

#SVM radial 

svmRadialModel = svm(Happy~.,data = trainData1,type="C-classification",kernel="radial")
resultsDF$predictedSvmRadial = predict(svmRadialModel,testData1)
resultsList$svmRadial <- confusionMatrix(factor(resultsDF$predictedSvmRadial),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$svmRadial)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 65 30
##          1 25 78
##                                           
##                Accuracy : 0.7222          
##                  95% CI : (0.6543, 0.7834)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 2.453e-07       
##                                           
##                   Kappa : 0.4424          
##  Mcnemar's Test P-Value : 0.5896          
##                                           
##               Precision : 0.7573          
##                  Recall : 0.7222          
##                      F1 : 0.7393          
##              Prevalence : 0.5455          
##          Detection Rate : 0.3939          
##    Detection Prevalence : 0.5202          
##       Balanced Accuracy : 0.7222          
##                                           
##        'Positive' Class : 1               
## 
plot(svmPolyModel,data=trainData1,Changing.the.past ~ Interests.or.hobbies) 

resultListNames <- names(resultsList)
for(i in c(1:11)){
  print(resultListNames[i])  
  print(resultsList[[i]]$overall["Accuracy"])
  print(resultsList[[i]]$byClass["F1"])
}
## [1] "Kmeans"
##  Accuracy 
## 0.6818182 
##        F1 
## 0.6735751 
## [1] "Hierarchical"
##  Accuracy 
## 0.6313131 
##        F1 
## 0.6507177 
## [1] "KMedoid"
##  Accuracy 
## 0.6767677 
##        F1 
## 0.7355372 
## [1] "KMedian"
##  Accuracy 
## 0.6868687 
##        F1 
## 0.7047619 
## [1] "Cmeans"
##  Accuracy 
## 0.7222222 
##        F1 
## 0.7598253 
## [1] "DecTreeCart"
##  Accuracy 
## 0.6616162 
##       F1 
## 0.685446 
## [1] "DecTreeC50"
##  Accuracy 
## 0.6969697 
##        F1 
## 0.7142857 
## [1] "DecTreeRF"
##  Accuracy 
## 0.7323232 
##        F1 
## 0.7534884 
## [1] "svmLin"
##  Accuracy 
## 0.7070707 
##       F1 
## 0.733945 
## [1] "svmPoly"
##  Accuracy 
## 0.7070707 
##       F1 
## 0.745614 
## [1] "svmRadial"
##  Accuracy 
## 0.7222222 
##        F1 
## 0.7393365

ENSEMBLE CLASSIFICATION

Majority vote: It is defined as taking the prediction with maximum vote / recommendation from multiple models predictions while predicting the outcomes of a classification problem.

determineMajority <- function(resDF,n){
  for(i in 1:nrow(resDF)) 
  {
    if(rowSums(resDF[i,]) >= n) {
      resDF$PredictedVote[i] = 1
    }
    else{
      resDF$PredictedVote[i] = 0
    }
  }
  return (resDF$PredictedVote)
}
resultsDF[] <- lapply(resultsDF, function(x) as.numeric(as.character(x)))
resultsDF <- as.data.frame(resultsDF)
resultsDF$PredictedVote = 1:nrow(resultsDF)
resultsDF$PredictedClusterVote = determineMajority(resultsDF[c(2,3:5)],3)
resultsDF$PredictedVote = determineMajority(resultsDF[c(2,3,5,8,11)],3)
confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedClusterVote,levels = 0:1),positive = '1',mode="prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 56 34
##          1 19 89
##                                           
##                Accuracy : 0.7323          
##                  95% CI : (0.6649, 0.7926)
##     No Information Rate : 0.6212          
##     P-Value [Acc > NIR] : 0.0006429       
##                                           
##                   Kappa : 0.4526          
##  Mcnemar's Test P-Value : 0.0544740       
##                                           
##               Precision : 0.8241          
##                  Recall : 0.7236          
##                      F1 : 0.7706          
##              Prevalence : 0.6212          
##          Detection Rate : 0.4495          
##    Detection Prevalence : 0.5455          
##       Balanced Accuracy : 0.7351          
##                                           
##        'Positive' Class : 1               
## 
confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedVote,levels = 0:1),positive = '1',mode="prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 56 34
##          1 21 87
##                                           
##                Accuracy : 0.7222          
##                  95% CI : (0.6543, 0.7834)
##     No Information Rate : 0.6111          
##     P-Value [Acc > NIR] : 0.0006932       
##                                           
##                   Kappa : 0.433           
##  Mcnemar's Test P-Value : 0.1056454       
##                                           
##               Precision : 0.8056          
##                  Recall : 0.7190          
##                      F1 : 0.7598          
##              Prevalence : 0.6111          
##          Detection Rate : 0.4394          
##    Detection Prevalence : 0.5455          
##       Balanced Accuracy : 0.7231          
##                                           
##        'Positive' Class : 1               
## 

Weighted average: In this, different weights are applied to predictions from multiple models then taking the average which means giving high or low importance to specific model output. Assigning weights based on the accuracies of the models. wAverageDetermination-This function is used to pass each row of the required columns from the data to the wAverageRow function. wAverageRow function takes each row of data passed to it and averages based on the weights assigned to different algorithms.

importantResults <- resultsDF[c(2,3,4,5,6,8,11)]
corrMatrix <- cor(importantResults)
corrplot(corrMatrix,method = "number",title = "Correlation Between Different Models",mar = c(0,0,2,0))

corrplot(cov(importantResults),method = "number",title = "Covariance Between Different Models",mar = c(0,0,2,0))

From the above results we observe there is no significant correlation or covariance between two models.Hence considering all of them is essential.The weights can hence be assigned according to the accuracies of the different models. We observe a significant correlation between Kmeans and Kmedian proving the similarity of the methods altogether. We consider the other algorithms with following weights

# table(resultsDF)
wAverageRow <- function(row){
  weights <- c(3,1,3,3,3,7)/20
  return(weighted.mean(row,weights))
}
wAverageDetermination <- function(){
  for(i in 1:nrow(resultsDF)) 
  {
    resultsDF$PredictedWA[i] = wAverageRow(resultsDF[i,][c(2,3,4,8,11,6)])
    if(resultsDF$PredictedWA[i] >= 0.5){
      resultsDF$PredictedVote[i] = 1
    }
    else{
      resultsDF$PredictedVote[i] = 0
    }
  }
  return (resultsDF$PredictedWA)
}
resultsDF$PredictedWA = 1:nrow(resultsDF)
resultsDF$PredictedWA = wAverageDetermination()
confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedWA,levels = 0:1),positive = '1',mode="prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 23  5
##          1  4 35
##                                           
##                Accuracy : 0.8657          
##                  95% CI : (0.7603, 0.9367)
##     No Information Rate : 0.597           
##     P-Value [Acc > NIR] : 1.559e-06       
##                                           
##                   Kappa : 0.7225          
##  Mcnemar's Test P-Value : 1               
##                                           
##               Precision : 0.8974          
##                  Recall : 0.8750          
##                      F1 : 0.8861          
##              Prevalence : 0.5970          
##          Detection Rate : 0.5224          
##    Detection Prevalence : 0.5821          
##       Balanced Accuracy : 0.8634          
##                                           
##        'Positive' Class : 1               
## 
# hist(resultsDF$PredictedWA)